import sys
import numpy
import matplotlib
import pandas
import plotly
import seaborn
import sklearn
import scipy
# Report the interpreter and library versions this notebook was run with.
_versions = (
    ("Python", sys.version),
    ("Numpy", numpy.__version__),
    ("Matplotlib", matplotlib.__version__),
    ("Pandas", pandas.__version__),
    ("Plotly", plotly.__version__),
    ("Seaborn", seaborn.__version__),
    ("Sklearn", sklearn.__version__),
    ("Scipy", scipy.__version__),
)
# One "<name> - <version>" line per entry, preceded by a blank line.
print("\n" + "".join(f"{name} - {version}\n" for name, version in _versions))
Python - 3.8.5 (default, Jul 28 2020, 12:59:40) [GCC 9.3.0] Numpy - 1.19.2 Matplotlib - 3.3.2 Pandas - 1.1.3 Plotly - 4.11.0 Seaborn - 0.11.0 Sklearn - 0.23.2 Scipy - 1.5.3
# Importing the necessary libraries
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random
import plotly
import plotly.offline as py
import plotly.graph_objs as go
import plotly.figure_factory as ff
import plotly.express as px
import plotly.tools as tls
import seaborn as sns
import sklearn
import scipy
import itertools
import warnings
# Install an "ignore" filter with no message, category or module restriction,
# so warnings raised after this point are not shown.
warnings.filterwarnings("ignore")
# Load the credit-card transactions CSV into a DataFrame and preview its first rows.
csv_path = '/home/abhishek/Documents/Jupyter Notebooks/creditcardML-master/creditcard.csv'
df = pd.read_csv(csv_path)
df.head()
| | Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | Class |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.0 | -1.359807 | -0.072781 | 2.536347 | 1.378155 | -0.338321 | 0.462388 | 0.239599 | 0.098698 | 0.363787 | ... | -0.018307 | 0.277838 | -0.110474 | 0.066928 | 0.128539 | -0.189115 | 0.133558 | -0.021053 | 149.62 | 0 |
| 1 | 0.0 | 1.191857 | 0.266151 | 0.166480 | 0.448154 | 0.060018 | -0.082361 | -0.078803 | 0.085102 | -0.255425 | ... | -0.225775 | -0.638672 | 0.101288 | -0.339846 | 0.167170 | 0.125895 | -0.008983 | 0.014724 | 2.69 | 0 |
| 2 | 1.0 | -1.358354 | -1.340163 | 1.773209 | 0.379780 | -0.503198 | 1.800499 | 0.791461 | 0.247676 | -1.514654 | ... | 0.247998 | 0.771679 | 0.909412 | -0.689281 | -0.327642 | -0.139097 | -0.055353 | -0.059752 | 378.66 | 0 |
| 3 | 1.0 | -0.966272 | -0.185226 | 1.792993 | -0.863291 | -0.010309 | 1.247203 | 0.237609 | 0.377436 | -1.387024 | ... | -0.108300 | 0.005274 | -0.190321 | -1.175575 | 0.647376 | -0.221929 | 0.062723 | 0.061458 | 123.50 | 0 |
| 4 | 2.0 | -1.158233 | 0.877737 | 1.548718 | 0.403034 | -0.407193 | 0.095921 | 0.592941 | -0.270533 | 0.817739 | ... | -0.009431 | 0.798278 | -0.137458 | 0.141267 | -0.206010 | 0.502292 | 0.219422 | 0.215153 | 69.99 | 0 |
5 rows × 31 columns
# Structural overview of the dataset: dimensions, column names,
# total number of missing cells and the number of distinct values per column.
n_rows, n_cols = df.shape
feature_names = df.columns.tolist()
missing_per_column = df.isnull().sum()
missing_total = missing_per_column.values.sum()
unique_counts = df.nunique()

print("Rows : ", n_rows)
print("Columns : ", n_cols)
print("\nFeatures : \n", feature_names)
print("\nMissing values : ", missing_total)
print("\nUnique values : \n", unique_counts)
Rows : 284807 Columns : 31 Features : ['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount', 'Class'] Missing values : 0 Unique values : Time 124592 V1 275663 V2 275663 V3 275663 V4 275663 V5 275663 V6 275663 V7 275663 V8 275663 V9 275663 V10 275663 V11 275663 V12 275663 V13 275663 V14 275663 V15 275663 V16 275663 V17 275663 V18 275663 V19 275663 V20 275663 V21 275663 V22 275663 V23 275663 V24 275663 V25 275663 V26 275663 V27 275663 V28 275663 Amount 32767 Class 2 dtype: int64
# Donut chart of the class balance (valid vs. fraudulent transactions).

# Count each class once and reuse the result for both labels and values.
class_counts = df["Class"].value_counts()
#labels
lab = class_counts.keys().tolist()
#values
val = class_counts.values.tolist()

# Derive each slice's name and colour from its class code instead of from its
# position, so a label can never be attached to the other class's count.
class_names = {0: "Valid Transactions", 1: "Fraud Transactions"}
class_colors = {0: "lime", 1: "orangered"}
slice_labels = [class_names.get(code, str(code)) for code in lab]
slice_colors = [class_colors.get(code, "grey") for code in lab]

trace = go.Pie(
    labels=slice_labels,
    values=val,
    marker=dict(
        colors=slice_colors,
        line=dict(color="white", width=1.3),
    ),
    rotation=90,
    hoverinfo="label+value+text",
    hole=.5,
)
layout = go.Layout(
    dict(
        # The bold tag is now properly closed (was "<b>...<b>").
        title="<b>Fraud and Valid Transactions</b>",
        height=400,
        width=700,
        plot_bgcolor="rgb(243,243,243)",
        paper_bgcolor="rgb(243,243,243)",
    )
)
data = [trace]
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)
The dataset contains 492 frauds out of 284,807 transactions. It is highly imbalanced: the positive class (frauds) accounts for only 0.173% of all transactions.
We can see that most of the transactions are valid, which shows that this is a real-world dataset. If we use this dataframe as the base for our predictive models and analysis, our algorithms will probably overfit, since they will "assume" that most transactions are not fraudulent.
# Descriptive statistics for every column, rendered as an interactive table.

# describe() has one column per feature; transpose so each feature is a row,
# then expose the feature name as a regular column called "feature".
summary = df.describe().T.reset_index()
summary = summary.rename(columns={"index": "feature"})
summary = np.around(summary, 3)

# Build the cell columns from the same column list used for the header so the
# two can never drift out of sync.
column_names = summary.columns.tolist()
val_lst = [summary[name] for name in column_names]

trace = go.Table(
    header=dict(
        values=column_names,
        line=dict(color=['#506784']),
        fill=dict(color=['#119DFF']),
    ),
    cells=dict(
        values=val_lst,
        line=dict(color=['#506784']),
        fill=dict(color=["lightgrey", '#F5F8FF']),
    ),
    columnwidth=[200, 60, 100, 100, 60, 60, 80, 80, 80],
)
# The bold tag is now properly closed (was "<b>...<b>").
layout = go.Layout(dict(title="<b>Variable Summary</b>", height=850, width=1000))
figure = go.Figure(data=[trace], layout=layout)
py.iplot(figure)
Of all the features, only Time, Amount, and Class (fraud or not fraud) are directly interpretable. The other 28 columns were transformed using what seems to be PCA (a dimensionality reduction technique) in order to protect user identities.
# Split the transactions by label (rows with Class == 1 go to `fraud`,
# rows with Class == 0 go to `valid`), then summarise the fraud amounts.
class_label = df['Class']
fraud = df[class_label == 1]
valid = df[class_label == 0]
fraud['Amount'].describe()
count 492.000000 mean 122.211321 std 256.683288 min 0.000000 25% 1.000000 50% 9.250000 75% 105.890000 max 2125.870000 Name: Amount, dtype: float64
# Histogram of every column on one large grid, then display the figure.
hist_figsize = (30, 30)
hist_color = '#4FC3F7'
df.hist(color=hist_color, figsize=hist_figsize)
plt.show()
Looking at the first histogram, which shows the variable Time, we can see that there are two peaks.
These correspond to the time of day: the peaks are the daytime, when most people make transactions, and the dip is the night, when most people are asleep. Because the data contains credit card transactions for only two days, there are two daytime peaks and one night-time dip between them.
def scatter_matrix(df, columns=("Time", "Amount", "V1", "V2", "V3")):
    """Display an interactive scatter-plot matrix with points coloured by ``Class``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Class`` column and every column named in ``columns``.
        The caller's frame is left untouched; a sorted copy is used internally.
    columns : sequence of str, optional
        Columns used as the dimensions of the matrix. Defaults to the five
        columns this function has always plotted.

    The figure is shown with ``py.iplot``; nothing is returned.
    """
    # Sort by class so the rows of each class are contiguous.
    df = df.sort_values(by="Class", ascending=True)

    # Map every distinct class label to a consecutive integer used as the
    # marker colour value. enumerate() handles any number of classes
    # (the previous range(2) raised IndexError with a single class).
    classes = df["Class"].unique().tolist()
    class_code = {label: code for code, label in enumerate(classes)}
    color_vals = [class_code[cl] for cl in df["Class"]]

    # Hover text must follow the *sorted* row order, the same order as the
    # dimension values below. Looking rows up by label with df.loc[k, ...]
    # returned them in the original order, and did one lookup per row.
    text = df["Class"].tolist()

    trace = go.Splom(
        dimensions=[dict(label=col, values=df[col]) for col in columns],
        text=text,
        showupperhalf=False,  # hide the upper triangle of the matrix
        marker=dict(
            color=color_vals,
            colorscale=["#FFC107", "#FF5722"],
            size=3,
            showscale=False,
            line=dict(width=.1, color='rgb(230,230,230)'),
        ),
    )

    # Shared axis styling.
    axis = dict(
        showline=True,
        zeroline=False,
        gridcolor="#fff",
        ticklen=4,
    )

    # NOTE(review): only axes 1-3 receive the shared styling although five
    # dimensions are plotted by default - confirm whether axes 4 and 5
    # should be styled as well.
    layout = go.Layout(
        dict(
            title="Scatter plot",
            autosize=False,
            height=1080,
            width=1080,
            dragmode="select",
            hovermode="closest",
            plot_bgcolor='rgba(240,240,240, 0.95)',
            xaxis1=dict(axis),
            yaxis1=dict(axis),
            xaxis2=dict(axis),
            yaxis2=dict(axis),
            xaxis3=dict(axis),
            yaxis3=dict(axis),
        )
    )

    data = [trace]
    fig = go.Figure(data=data, layout=layout)
    py.iplot(fig)
# Draw the scatter-plot matrix for the loaded transactions DataFrame.
scatter_matrix(df)